Row / column missing patterns

  • Do all rows / columns have the same percentage of missing values?
  • Are there correlations between missing rows / columns? (If a value is missing in one column it is likely to be missing in another column.)

Sample dataset with missing values

library(tidyverse)
# Add NAs to mtcars dataset
set.seed(5702)
mycars <- mtcars
mycars[,"gear"] <- NA
mycars[10:20, 3:5] <- NA
for (i in 1:10) mycars[sample(32,1), sample(11,1)] <- NA

Row / column missing patterns

Missing values by column

colSums(is.na(mycars)) %>%
  sort(decreasing = TRUE)
## gear disp   hp drat   am  cyl qsec   vs  mpg   wt carb 
##   32   12   12   11    3    1    1    1    0    0    0

Row / column missing patterns

Missing values by row

rowSums(is.na(mycars)) %>%
  sort(decreasing = TRUE)
##          Merc 450SE         Merc 450SLC Lincoln Continental            Fiat 128 
##                   5                   5                   5                   5 
##            Merc 280           Merc 280C          Merc 450SL  Cadillac Fleetwood 
##                   4                   4                   4                   4 
##   Chrysler Imperial         Honda Civic      Toyota Corolla        Lotus Europa 
##                   4                   4                   4                   3 
##         AMC Javelin           Fiat X1-9           Mazda RX4       Mazda RX4 Wag 
##                   2                   2                   1                   1 
##          Datsun 710      Hornet 4 Drive   Hornet Sportabout             Valiant 
##                   1                   1                   1                   1 
##          Duster 360           Merc 240D            Merc 230       Toyota Corona 
##                   1                   1                   1                   1 
##    Dodge Challenger          Camaro Z28    Pontiac Firebird       Porsche 914-2 
##                   1                   1                   1                   1 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##                   1                   1                   1                   1

Row / column missing patterns

heatmap

geom_tile()

library(tidyverse)

tidycars <- mycars %>% 
    rownames_to_column("id") %>% 
    gather(key, value, -id) %>% 
    mutate(missing = ifelse(is.na(value), "yes", "no"))
ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = missing)) +
  geom_tile(color = "white") + 
  ggtitle("mtcars with NAs added") +
  scale_fill_viridis_d() + # discrete scale
  theme_bw()

Row / column missing patterns

heatmap

mi::missing_data.frame()

library(mi)
x <- missing_data.frame(mycars)
image(x)

(gear not shown since all are missing)

Row / column missing patterns

Missing values by variable

geom_tile()

ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = value)) +
  geom_tile(color = "white") + 
  scale_fill_gradient(low = "grey80", high = "red", na.value = "black") + theme_bw()

Row / column missing patterns

Missing values by variable

geom_tile() with standardized variables

tidycars <- tidycars %>% group_by(key) %>% 
  mutate(Std = (value-mean(value, na.rm = TRUE))/sd(value, na.rm = TRUE)) %>% ungroup()

ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = Std)) +
  geom_tile(color = "white") + 
  scale_fill_gradient2(low = "blue", mid = "white", high ="yellow", na.value = "black") + theme_bw()

Row / column missing patterns

Missing values by variable

reordered by number of missing

# convert missing to numeric so it can be summed up
tidycars <- tidycars %>% 
  mutate(missing2 = ifelse(missing == "yes", 1, 0))

ggplot(tidycars, aes(x = fct_reorder(key, -missing2, sum), y = fct_reorder(id, -missing2, sum), fill = Std)) +
  geom_tile(color = "white") + 
  scale_fill_gradient2(low = "blue", mid = "white", high ="yellow", na.value = "black") + theme_bw()

Row / column missing patterns

Missing values by variable

Row / column missing patterns

Missing values by variable

missing patterns

x <- mi::missing_data.frame(mycars)
## NOTE: In the following pairs of variables, the missingness pattern of the second is a subset of the first.
##  Please verify whether they are in fact logically distinct variables.
##      [,1]   [,2]  
## [1,] "disp" "drat"
## [2,] "disp" "qsec"
## [3,] "hp"   "drat"
## [4,] "hp"   "qsec"
## [5,] "hp"   "am"  
## [6,] "drat" "qsec"
class(x)
## [1] "missing_data.frame"
## attr(,"package")
## [1] "mi"
x@patterns
##  [1] nothing              nothing              nothing             
##  [4] nothing              nothing              nothing             
##  [7] nothing              nothing              nothing             
## [10] disp, hp, drat       disp, hp, drat       disp, hp, drat, am  
## [13] disp, hp, drat       disp, hp, drat, qsec disp, hp, drat      
## [16] disp, hp, drat, am   disp, hp, drat       cyl, disp, hp, drat 
## [19] disp, hp, drat       disp, hp, drat       nothing             
## [22] nothing              vs                   nothing             
## [25] nothing              disp                 nothing             
## [28] hp, am               nothing              nothing             
## [31] nothing              nothing             
## 8 Levels: nothing vs disp hp, am disp, hp, drat ... cyl, disp, hp, drat
levels(x@patterns)
## [1] "nothing"              "vs"                   "disp"                
## [4] "hp, am"               "disp, hp, drat"       "disp, hp, drat, am"  
## [7] "disp, hp, drat, qsec" "cyl, disp, hp, drat"
summary(x@patterns)
##              nothing                   vs                 disp 
##                   18                    1                    1 
##               hp, am       disp, hp, drat   disp, hp, drat, am 
##                    1                    7                    2 
## disp, hp, drat, qsec  cyl, disp, hp, drat 
##                    1                    1

Aggregated missing patterns

(repeated patterns are reduced to one row)

# note: extracat is no longer available
library(extracat)
visna(mycars)

Aggregated missing patterns

Sorted by most common to least common missing pattern (top to bottom)

visna(mycars, sort = "r")

Aggregated missing patterns

Sorted by variable with the most to least missing values (left to right)

visna(mycars, sort = "c")

Aggregated missing patterns

visna(mycars, sort = "b")

NYC School data

df <- read.csv("SAT2010.csv", na.strings = "s",
               check.names = FALSE)
head(df)
##      DBN                                    School Name Number of Test Takers
## 1 01M292 Henry Street School for International Studies                     31
## 2 01M448           University Neighborhood High School                     60
## 3 01M450               East Side Community High School                     69
## 4 01M458                  SATELLITE ACADEMY FORSYTH ST                     26
## 5 01M509                              CMSP HIGH SCHOOL                     NA
## 6 01M515       Lower East Side Preparatory High School                    154
##   Critical Reading Mean Mathematics Mean Writing Mean
## 1                   391              425          385
## 2                   394              419          387
## 3                   418              431          402
## 4                   385              370          378
## 5                    NA               NA           NA
## 6                   314              532          314
dim(df)
## [1] 460   6

NYC School data

visna(df)

Value patterns in missing data

  • Are missing patterns correlated with values of another variable?
  • Are certain value ranges more likely to be missing?
  • Or are there no patterns at all?
  • (An explanation may not be available within the dataset.)

NYC School data

Does the proportion of schools with missing data vary by borough?

Data: SAT2010.csv

head(df)
##      DBN                                    School Name Number of Test Takers
## 1 01M292 Henry Street School for International Studies                     31
## 2 01M448           University Neighborhood High School                     60
## 3 01M450               East Side Community High School                     69
## 4 01M458                  SATELLITE ACADEMY FORSYTH ST                     26
## 5 01M509                              CMSP HIGH SCHOOL                     NA
## 6 01M515       Lower East Side Preparatory High School                    154
##   Critical Reading Mean Mathematics Mean Writing Mean
## 1                   391              425          385
## 2                   394              419          387
## 3                   418              431          402
## 4                   385              370          378
## 5                    NA               NA           NA
## 6                   314              532          314

Missing by borough

df <- df %>% mutate(Borough = str_sub(DBN, 3, 3))
percent_missing <- df %>% group_by(Borough) %>% 
  summarize(num_schools = n(), num_na = sum(is.na(`Writing Mean`))) %>% 
  mutate(percent_na = round(num_na/num_schools, 2)) %>% 
  arrange(-percent_na)
percent_missing
## # A tibble: 5 × 4
##   Borough num_schools num_na percent_na
##   <chr>         <int>  <int>      <dbl>
## 1 K               141     32       0.23
## 2 Q                73     14       0.19
## 3 M               111     13       0.12
## 4 X               124     14       0.11
## 5 R                11      1       0.09

Missing by borough

K, Q, M, X, R

Manhattan

Brooklyn

Queens

The Bronx

Staten Island

Missing by borough

K, Q, M, X, R

Manhattan = New York County

Brooklyn = Kings County

Queens = Queens County

The Bronx = Bronx County

Staten Island = Richmond County

Missing by borough

K, Q, M, X, R

Manhattan = New York County = “M”

Brooklyn = Kings County = “K”

Queens = Queens County = “Q”

The Bronx = Bronx County = “X”

Staten Island = Richmond County = “R”

Missing by borough

percent_missing <- percent_missing %>%
  mutate(BoroughName = fct_recode(Borough,
                                      Manhattan = "M",
                                      Brooklyn = "K", 
                                      Queens = "Q", 
                                      `The Bronx` = "X",
                                      `Staten Island` = "R"))
percent_missing
## # A tibble: 5 × 5
##   Borough num_schools num_na percent_na BoroughName  
##   <chr>         <int>  <int>      <dbl> <fct>        
## 1 K               141     32       0.23 Brooklyn     
## 2 Q                73     14       0.19 Queens       
## 3 M               111     13       0.12 Manhattan    
## 4 X               124     14       0.11 The Bronx    
## 5 R                11      1       0.09 Staten Island

Missing by borough

df <- df %>% 
  mutate(BoroughName = fct_recode(Borough, 
                                      Manhattan = "M",
                                      Brooklyn = "K", 
                                      Queens = "Q", 
                                      `The Bronx` = "X",
                                      `Staten Island` = "R"))

dfsum <- df %>% group_by(BoroughName) %>% 
  summarize(Reading = round(mean(`Critical Reading Mean`, na.rm = TRUE), 1), 
            Math = round(mean(`Mathematics Mean`, na.rm = TRUE), 1),
            Writing = round(mean(`Writing Mean`, na.rm = TRUE), 1)) %>%
  left_join(percent_missing %>% select(BoroughName, percent_na), 
            by = "BoroughName") %>% 
  arrange(desc(percent_na))

Missing by borough and scores

dfsumtidy <-  dfsum %>% pivot_longer(cols = Reading:Writing, 
                                     names_to = "subject",
                                     values_to = "meanscore")
ggplot(dfsumtidy, aes(meanscore, percent_na, color = BoroughName)) + geom_point(size = 2) + facet_wrap(~subject) + theme_bw() +
  theme(legend.position = "bottom")

EXAMPLE: Snowfall

I need to know how much snow fell per day in New York State in February 2017, on a county or more detailed level. I know some government agency measures and reports snowfall and puts the data online.

Snowfall

I need to know how much snow fell per day in New York State in February 2017, on a county or more detailed level. I know some government agency measures and reports snowfall and puts the data online.

Source: https://www.ncdc.noaa.gov/snow-and-ice/daily-snow/NY/snowfall/20170201

Accessed: 2017-10-26

library(tidyverse)

# https://www.ncdc.noaa.gov/snow-and-ice/daily-snow/
# Accessed 2017-10-26
# Screenshot 2017-10-26 18.45.11.png
snow <- read_csv("NY-snowfall-201702.csv",
                 skip = 1,
                 na = "M")

# replace "T" (trace) with .005

trace_replace <- function(x) as.numeric(ifelse(x == "T", .005, x))

snow2 <- snow %>%
  mutate_at(vars(`Feb 1`:`Feb 28`), trace_replace)


# note: extracat is no longer available
extracat::visna(snow, sort = "r")

# note: extracat is no longer available
extracat::visna(snow, sort = "c")

# note: extracat is no longer available
extracat::visna(snow, sort = "b")

# tidy snow data frame, add missing (yes or no) column

tidysnow <- snow %>% 
  select(`Station Name`, County, `Feb 1`:`Feb 28`) %>%
  gather(key, value, -`Station Name`, -County) %>%
  mutate(missing = ifelse(is.na(value), "yes", "no"))

# add a zero to single digit dates so they'll sort
# correctly (Feb 1 becomes Feb 01) 
# 
tidysnow$key <- ifelse(nchar(tidysnow$key) == 5,
                       gsub(" ", " 0", tidysnow$key),
                       tidysnow$key)


ggplot(tidysnow, aes(x = key, y = `Station Name`, fill = missing)) + 
  geom_raster() + scale_fill_manual(values = c("white", "red")) + scale_x_discrete("February 2017", labels = 1:28) + theme_bw()

Number missing by county mean

theme_dotplot <- theme_bw(16) +
  theme(axis.ticks.y = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(size = 0.5),
        panel.grid.minor.x = element_blank())

na_by_station <- tidysnow %>% 
  group_by(County, `Station Name`) %>% 
  summarize(count = sum(is.na(value))) %>% 
  mutate(type = "station")

mean_na_by_county <- na_by_station %>% 
  group_by(County) %>%
  summarize(count = mean(count)) %>% 
  add_column(`Station Name` = "mean", .before = "count") %>% 
  mutate(type = "mean")

combined_na <- bind_rows(na_by_station, mean_na_by_county)

ggplot(combined_na, aes(x = count, 
                        y = reorder(County, count, mean), 
                        color = type)) + geom_point() +
  xlab("Number of missing values (out of 28)") +
  theme_dotplot + 
  theme(axis.text.y = element_text(size = rel(.5)))

Number missing by day of month

# change dates to YYYY-MM-DD format
tidysnow$key <- gsub("Feb ", "2017-02-", tidysnow$key)

# calculate number missing by day
missing <- tidysnow %>% 
    group_by(key) %>% 
    summarise(sum.na = sum(is.na(value)))

# plot number missing by day
ggplot(missing, aes(x = 1:28, y = sum.na)) +
  geom_col(color = "blue", fill = "lightblue") +
  scale_x_continuous(breaks = 1:28, labels = missing$key) +
  ggtitle("Number of missing values by day") +
  xlab("") +
  ylab("Number of missing station values (out of 349)") +
  theme(axis.text.x = element_text(angle = 90))

# add day of week info
missing <- missing %>% 
    mutate(dayofweek = weekdays(as.Date(key),
                                abbreviate = FALSE))
# correct day order
daysinorder <- c("Monday", "Tuesday", "Wednesday",
                 "Thursday", "Friday", "Saturday",
                 "Sunday")

# reorder dayofweek
missing$dayofweek <- factor(missing$dayofweek,
                            levels = daysinorder)
# choose colors
daycolors <- c(rep("#cbc9e2", 5), rep("#2b8cbe", 2))

# plot missing values by day, weekday/weekend colors
ggplot(missing, aes(x = key, y = sum.na, fill = dayofweek)) +
    geom_col() +
    ggtitle("Number of missing values by day") +
    scale_fill_manual(values = daycolors) +
    xlab("") +
  ylab("Number of missing station values (out of 349)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90))

Number of missing values per day by total snowfall

# calculate mean snowfall by day
meansnow <- tidysnow %>% 
    filter(!is.na(value) & (value != "T")) %>% 
    mutate(value = as.numeric(value)) %>% 
    group_by(key) %>% 
    summarise(avg = mean(value))

# calculate average snowfall per station by day
infodf <- right_join(missing, meansnow, by = "key") 

# scatterplot of number missing by average snowfall
# What is the pattern?

g <- ggplot(infodf, aes(x = avg, y = sum.na)) +
    geom_point() +
  xlab("Mean statewide snowfall (in inches)") +
  ylab("Number of missing station values (out of 349)")
g

no gridlines

g + theme_classic()